import pandas as pd
import numpy as np
import os
from bs4 import BeautifulSoup
import requests
import glob
import re
import xlwings
import math
import datetime
import plotly.express as px
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
pd.set_option('display.max_columns', None)
Import depuis le site du vendeeglobe
# Fetch the Vendée Globe ranking page
r = requests.get('https://www.vendeeglobe.org/fr/classement')
r.status_code  # notebook-style check of the HTTP status (should be 200)
content = r.content.decode('utf-8')
# NOTE(review): no parser is passed to BeautifulSoup, so bs4 guesses one (and
# warns); consider BeautifulSoup(content, 'html.parser') for reproducibility
soup = BeautifulSoup(content)
On cherche tous les liens de la forme suivante https://www.vendeeglobe.org/download-race-data/vendeeglobe_20210304_170000.xlsx
l = soup.find('select', attrs={'class': 'form__input'}).text.split('- ')[1:]
def format_files(s):
    """Turn a dropdown timestamp ('YYYY-MM-DD HH:MM:SS') into the URL of the
    corresponding race-data Excel file."""
    stamp = s.strip()
    for old, new in (('-', ''), (':', ''), (' ', '_')):
        stamp = stamp.replace(old, new)
    res = 'https://www.vendeeglobe.org/download-race-data/vendeeglobe_' + stamp + '.xlsx'
    return res
res = list(map(format_files, l)) # La liste contenant tous les chemins des fichiers excel à télécharger
La cellule suivante permet de charger tous les fichiers Excel, elle est désactivée car utile une seule fois
# save_path = 'data'
# for link in res:
# resp = requests.get(link)
# file_name = str(link)[-20:]
# completeName = os.path.join(save_path, file_name)
# output = open(completeName, 'wb')
# output.write(resp.content)
# output.close()
Import depuis le glossaire du site du vendee globe
# Fetch the glossary page (boat/skipper characteristics)
r_voiliers = requests.get('https://www.vendeeglobe.org/fr/glossaire')
# r_voiliers.status_code
content = r_voiliers.content.decode('utf-8')
# NOTE(review): as above, no explicit parser is given to BeautifulSoup
soup_voiliers = BeautifulSoup(content)
# Build the column header for the boats table from the first glossary entry:
# each <li> reads "Label : value" and we keep only the label part.
header_voilier = []
first_entry = soup_voiliers.find_all('div', attrs={'class':'sv-g'})[1].find_all('div', attrs={'class':'sv-u-1'})[:1]
for bloc in first_entry:
    for item in bloc.find_all('li'):
        header_voilier.append(re.findall('([^:]*) :', item.text)[0])
# Two extra columns filled from the info block rather than the <li> list
header_voilier.extend(["Skipper", "Nom"])
# Scrape one row per boat from the glossary: skipper + boat name from the info
# block, plus every "Label : value" item of the description list.
div_voiliers = soup_voiliers.find_all('div', attrs={'class':'sv-g'})[1].find_all('div', attrs={'class':'sv-u-1'})
div_skippers = soup_voiliers.find_all('div', attrs={'class' : 'boats-list__infos'})
rows = []
for div_voilier, div_skipper in zip(div_voiliers, div_skippers):
    values_to_add = {
        "Skipper": div_skipper.find('span').text,
        "Nom": div_skipper.find('h3').text,
    }
    for li in div_voilier.find_all('li'):
        col = re.findall('([^:]*) :', li.text)[0]
        values_to_add[col] = re.findall(' :(.*)', li.text)[0]
    rows.append(values_to_add)
# Build the frame in a single pass: the original row-by-row DataFrame.append
# was deprecated (removed in pandas 2.0) and O(n^2). Assumes every <li> label
# also appears in header_voilier (the header is built from the same labels).
df_voiliers = pd.DataFrame(rows, columns=header_voilier)
# Code réalisé par Alexandre Le Bris pour régler un problème lié à 'xxid'
# lors de la lecture des fichiers excel par pandas
import tempfile
from zipfile import ZipFile
import shutil
import os
from fnmatch import fnmatch
import re
def change_in_zip(file_name, name_filter, change):
    """Rewrite the zip archive ``file_name`` (e.g. an .xlsx) in place.

    ``change`` (bytes -> bytes) is applied to every member whose name matches
    the fnmatch pattern ``name_filter``; all other members are copied as-is.

    Fix: the original ignored ``name_filter`` and transformed every member.
    """
    tempdir = tempfile.mkdtemp()
    try:
        tempname = os.path.join(tempdir, 'new.zip')
        with ZipFile(file_name, 'r') as r, ZipFile(tempname, 'w') as w:
            for item in r.infolist():
                data = r.read(item.filename)
                if fnmatch(item.filename, name_filter):
                    data = change(data)
                w.writestr(item, data)
        # Atomically replace the original archive with the rewritten one
        shutil.move(tempname, file_name)
    finally:
        shutil.rmtree(tempdir)
# def change_all():
# for filename in glob.glob('data/*'):
# change_in_zip(filename, name_filter='xl/styles.xml', change=lambda d: re.sub(b'xxid="\d*"', b"", d))
def xlsx_broken(file):
    """Open and re-save *file* through a hidden Excel instance.

    Re-saving normalises the workbook so pandas can read it afterwards
    (workaround for the 'xxid' issue mentioned above).
    """
    app = xlwings.App(visible=False)
    book = app.books.open(file)
    book.save()
    book.close()
    app.quit()
# Number of files under data/, used as the default upper bound below
n2_change_all = len(glob.glob('data/*'))
def change_all(n1=0, n2 = n2_change_all):
    """Repair the Excel files data/[n1:n2] by re-saving each through Excel."""
    files_to_fix = glob.glob('data/*')[n1:n2]
    for path in files_to_fix:
        print(path)
        xlsx_broken(path)
    print("done")
# change_all(250) # à n'executer qu'une fois
# Column names for the ranking Excel files (one row per skipper per ranking).
# Suffixes: _30m / _class / _24h are three measurement windows (presumably
# last 30 minutes / since last ranking / last 24 hours -- TODO confirm);
# DTF = distance to finish, DTL = distance to leader.
col_vendee = ['Date', 'Rang', 'Nat/Voile',
              'Skipper/Bateau', 'Heure_Fr', 'Lat',
              'long', 'Cap_30m', 'Vitesse_30m',
              'VMG_30m', 'Distance_30m'
              , 'Cap_class', 'Vitesse_class',
              'VMG_class', 'Distance_class'
              , 'Cap_24h', 'Vitesse_24h',
              'VMG_24h', 'Distance_24h'
              , 'DTF', 'DTL']
### Build df_vendee, printing a progress mark every 50 iterations
l_df = []
# File known to be unreadable; skipped inside the loop (Windows path separator)
l_banned_files = [r'data\20201108_120200.xlsx']
# NOTE(review): machine-specific absolute path -- the loop only works from this
# exact directory; consider a relative path or pathlib instead
abs_path = r"C:\Users\Pierrick\OneDrive\Bureau\K\2_MS_TelecomParis\Cours\INFMDI721_Kit_Big_Data\PierrickLeroy\Projet_final\\"
cnt = 0
start = datetime.datetime.now()
for filename in glob.glob('data/*')[:483]:
    cnt += 1
    if cnt%50 == 0:
        print(cnt, 'fichiers chargés en', datetime.datetime.now() - start)
    if filename in l_banned_files:
        continue
    print(filename)
    # YYYYMMDD_HHMMSS timestamp taken from the file name, reused as Date below
    date = re.findall(r"\d{8}_\d{6}", filename)[0]
    # First 5 rows of each file are header decoration; column names come from
    # col_vendee instead
    df = pd.read_excel(abs_path + filename
                       , skiprows = 5
                       , header = None
                       , names = col_vendee)
    df['Date'] = date
    l_df.append(df)
df_vendee = pd.concat(l_df)
data\20201108_140000.xlsx data\20201108_150000.xlsx data\20201108_170000.xlsx data\20201108_210000.xlsx data\20201109_040000.xlsx data\20201109_080000.xlsx data\20201109_110000.xlsx data\20201109_140000.xlsx data\20201109_170000.xlsx data\20201109_210000.xlsx data\20201110_040000.xlsx data\20201110_080000.xlsx data\20201110_110000.xlsx data\20201110_140000.xlsx data\20201110_170000.xlsx data\20201110_210000.xlsx data\20201111_040000.xlsx data\20201111_080000.xlsx data\20201111_110000.xlsx data\20201111_140000.xlsx data\20201111_170000.xlsx data\20201111_210000.xlsx data\20201112_040000.xlsx data\20201112_080000.xlsx data\20201112_110000.xlsx data\20201112_140000.xlsx data\20201112_170000.xlsx data\20201112_210000.xlsx data\20201113_040000.xlsx data\20201113_080000.xlsx data\20201113_110000.xlsx data\20201113_140000.xlsx data\20201113_170000.xlsx data\20201113_210000.xlsx data\20201114_040000.xlsx data\20201114_080000.xlsx data\20201114_110000.xlsx data\20201114_140000.xlsx data\20201114_170000.xlsx data\20201114_210000.xlsx data\20201115_040000.xlsx data\20201115_080000.xlsx data\20201115_110000.xlsx data\20201115_140000.xlsx data\20201115_170000.xlsx data\20201115_210000.xlsx data\20201116_040000.xlsx data\20201116_080000.xlsx 50 fichiers chargés en 0:00:05.736656 data\20201116_110000.xlsx data\20201116_140000.xlsx data\20201116_170000.xlsx data\20201116_210000.xlsx data\20201117_040000.xlsx data\20201117_080000.xlsx data\20201117_110000.xlsx data\20201117_140000.xlsx data\20201117_170000.xlsx data\20201117_210000.xlsx data\20201118_040000.xlsx data\20201118_080000.xlsx data\20201118_110000.xlsx data\20201118_140000.xlsx data\20201118_170000.xlsx data\20201118_210000.xlsx data\20201119_040000.xlsx data\20201119_080000.xlsx data\20201119_110000.xlsx data\20201119_140000.xlsx data\20201119_170000.xlsx data\20201119_210000.xlsx data\20201120_040000.xlsx data\20201120_080000.xlsx data\20201120_110000.xlsx data\20201120_140000.xlsx data\20201120_170000.xlsx 
data\20201120_210000.xlsx data\20201121_040000.xlsx data\20201121_080000.xlsx data\20201121_110000.xlsx data\20201121_140000.xlsx data\20201121_170000.xlsx data\20201121_210000.xlsx data\20201122_040000.xlsx data\20201122_080000.xlsx data\20201122_110000.xlsx data\20201122_140000.xlsx data\20201122_170000.xlsx data\20201122_210000.xlsx data\20201123_040000.xlsx data\20201123_080000.xlsx data\20201123_110000.xlsx data\20201123_140000.xlsx data\20201123_170000.xlsx data\20201123_210000.xlsx data\20201124_040000.xlsx data\20201124_080000.xlsx data\20201124_110000.xlsx data\20201124_140000.xlsx 100 fichiers chargés en 0:00:11.174114 data\20201124_170000.xlsx data\20201124_210000.xlsx data\20201125_040000.xlsx data\20201125_080000.xlsx data\20201125_110000.xlsx data\20201125_140000.xlsx data\20201125_170000.xlsx data\20201125_210000.xlsx data\20201126_040000.xlsx data\20201126_080000.xlsx data\20201126_110000.xlsx data\20201126_140000.xlsx data\20201126_170000.xlsx data\20201126_210000.xlsx data\20201127_040000.xlsx data\20201127_080000.xlsx data\20201127_110000.xlsx data\20201127_140000.xlsx data\20201127_170000.xlsx data\20201127_210000.xlsx data\20201128_040000.xlsx data\20201128_080000.xlsx data\20201128_110000.xlsx data\20201128_140000.xlsx data\20201128_170000.xlsx data\20201128_210000.xlsx data\20201129_040000.xlsx data\20201129_080000.xlsx data\20201129_110000.xlsx data\20201129_140000.xlsx data\20201129_170000.xlsx data\20201129_210000.xlsx data\20201130_040000.xlsx data\20201130_080000.xlsx data\20201130_110000.xlsx data\20201130_140000.xlsx data\20201130_170000.xlsx data\20201130_210000.xlsx data\20201201_040000.xlsx data\20201201_080000.xlsx data\20201201_110000.xlsx data\20201201_140000.xlsx data\20201201_170000.xlsx data\20201201_210000.xlsx data\20201202_040000.xlsx data\20201202_080000.xlsx data\20201202_110000.xlsx data\20201202_140000.xlsx data\20201202_170000.xlsx data\20201202_210000.xlsx 150 fichiers chargés en 0:00:17.377522 
data\20201203_040000.xlsx data\20201203_080000.xlsx data\20201203_110000.xlsx data\20201203_140000.xlsx data\20201203_170000.xlsx data\20201203_210000.xlsx data\20201204_040000.xlsx data\20201204_080000.xlsx data\20201204_110000.xlsx data\20201204_140000.xlsx data\20201204_170000.xlsx data\20201204_210000.xlsx data\20201205_040000.xlsx data\20201205_080000.xlsx data\20201205_110000.xlsx data\20201205_140000.xlsx data\20201205_170000.xlsx data\20201205_210000.xlsx data\20201206_040000.xlsx data\20201206_080000.xlsx data\20201206_110000.xlsx data\20201206_140000.xlsx data\20201206_170000.xlsx data\20201206_210000.xlsx data\20201207_040000.xlsx data\20201207_080000.xlsx data\20201207_110000.xlsx data\20201207_140000.xlsx data\20201207_170000.xlsx data\20201207_210000.xlsx data\20201208_040000.xlsx data\20201208_080000.xlsx data\20201208_110000.xlsx data\20201208_140000.xlsx data\20201208_170000.xlsx data\20201208_210000.xlsx data\20201209_040000.xlsx data\20201209_080000.xlsx data\20201209_110000.xlsx data\20201209_140000.xlsx data\20201209_170000.xlsx data\20201209_210000.xlsx data\20201210_040000.xlsx data\20201210_080000.xlsx data\20201210_110000.xlsx data\20201210_140000.xlsx data\20201210_170000.xlsx data\20201210_210000.xlsx data\20201211_040000.xlsx data\20201211_080000.xlsx 200 fichiers chargés en 0:00:22.077949 data\20201211_110000.xlsx data\20201211_140000.xlsx data\20201211_170000.xlsx data\20201211_210000.xlsx data\20201212_040000.xlsx data\20201212_080000.xlsx data\20201212_110000.xlsx data\20201212_140000.xlsx data\20201212_170000.xlsx data\20201212_210000.xlsx data\20201213_040000.xlsx data\20201213_080000.xlsx data\20201213_110000.xlsx data\20201213_140000.xlsx data\20201213_170000.xlsx data\20201213_210000.xlsx data\20201214_040000.xlsx data\20201214_080000.xlsx data\20201214_110000.xlsx data\20201214_140000.xlsx data\20201214_170000.xlsx data\20201214_210000.xlsx data\20201215_040000.xlsx data\20201215_080000.xlsx data\20201215_110000.xlsx 
data\20201215_140000.xlsx data\20201215_170000.xlsx data\20201215_210000.xlsx data\20201216_040000.xlsx data\20201216_080000.xlsx data\20201216_110000.xlsx data\20201216_140000.xlsx data\20201216_170000.xlsx data\20201216_210000.xlsx data\20201217_040000.xlsx data\20201217_080000.xlsx data\20201217_110000.xlsx data\20201217_140000.xlsx data\20201217_170000.xlsx data\20201217_210000.xlsx data\20201218_040000.xlsx data\20201218_080000.xlsx data\20201218_110000.xlsx data\20201218_140000.xlsx data\20201218_170000.xlsx data\20201218_210000.xlsx data\20201219_040000.xlsx data\20201219_080000.xlsx data\20201219_110000.xlsx data\20201219_140000.xlsx 250 fichiers chargés en 0:00:27.125447 data\20201219_170000.xlsx data\20201219_210000.xlsx data\20201220_040000.xlsx data\20201220_080000.xlsx data\20201220_110000.xlsx data\20201220_140000.xlsx data\20201220_170000.xlsx data\20201220_210000.xlsx data\20201221_040000.xlsx data\20201221_080000.xlsx data\20201221_110000.xlsx data\20201221_140000.xlsx data\20201221_170000.xlsx data\20201221_210000.xlsx data\20201222_040000.xlsx data\20201222_080000.xlsx data\20201222_110000.xlsx data\20201222_140000.xlsx data\20201222_170000.xlsx data\20201222_210000.xlsx data\20201223_040000.xlsx data\20201223_080000.xlsx data\20201223_110000.xlsx data\20201223_140000.xlsx data\20201223_170000.xlsx data\20201223_210000.xlsx data\20201224_040000.xlsx data\20201224_080000.xlsx data\20201224_110000.xlsx data\20201224_140000.xlsx data\20201224_170000.xlsx data\20201224_210000.xlsx data\20201225_040000.xlsx data\20201225_080000.xlsx data\20201225_110000.xlsx data\20201225_140000.xlsx data\20201225_170000.xlsx data\20201225_210000.xlsx data\20201226_040000.xlsx data\20201226_080000.xlsx data\20201226_110000.xlsx data\20201226_140000.xlsx data\20201226_170000.xlsx data\20201226_210000.xlsx data\20201227_040000.xlsx data\20201227_080000.xlsx data\20201227_110000.xlsx data\20201227_140000.xlsx data\20201227_170000.xlsx data\20201227_210000.xlsx 300 
fichiers chargés en 0:00:33.013699 data\20201228_040000.xlsx data\20201228_080000.xlsx data\20201228_110000.xlsx data\20201228_140000.xlsx data\20201228_170000.xlsx data\20201228_210000.xlsx data\20201229_040000.xlsx data\20201229_080000.xlsx data\20201229_110000.xlsx data\20201229_140000.xlsx data\20201229_170000.xlsx data\20201229_210000.xlsx data\20201230_040000.xlsx data\20201230_080000.xlsx data\20201230_110000.xlsx data\20201230_140000.xlsx data\20201230_170000.xlsx data\20201230_210000.xlsx data\20201231_040000.xlsx data\20201231_080000.xlsx data\20201231_110000.xlsx data\20201231_140000.xlsx data\20201231_170000.xlsx data\20201231_210000.xlsx data\20210101_040000.xlsx data\20210101_080000.xlsx data\20210101_110000.xlsx data\20210101_140000.xlsx data\20210101_170000.xlsx data\20210101_210000.xlsx data\20210102_040000.xlsx data\20210102_080000.xlsx data\20210102_110000.xlsx data\20210102_140000.xlsx data\20210102_170000.xlsx data\20210102_210000.xlsx data\20210103_040000.xlsx data\20210103_080000.xlsx data\20210103_110000.xlsx data\20210103_140000.xlsx data\20210103_170000.xlsx data\20210103_210000.xlsx data\20210104_040000.xlsx data\20210104_080000.xlsx data\20210104_110000.xlsx data\20210104_140000.xlsx data\20210104_170000.xlsx data\20210104_210000.xlsx data\20210105_040000.xlsx data\20210105_080000.xlsx 350 fichiers chargés en 0:00:40.249344 data\20210105_110000.xlsx data\20210105_140000.xlsx data\20210105_170000.xlsx data\20210105_210000.xlsx data\20210106_040000.xlsx data\20210106_080000.xlsx data\20210106_110000.xlsx data\20210106_140000.xlsx data\20210106_170000.xlsx data\20210106_210000.xlsx data\20210107_040000.xlsx data\20210107_080000.xlsx data\20210107_110000.xlsx data\20210107_140000.xlsx data\20210107_170000.xlsx data\20210107_210000.xlsx data\20210108_040000.xlsx data\20210108_080000.xlsx data\20210108_110000.xlsx data\20210108_140000.xlsx data\20210108_170000.xlsx data\20210108_210000.xlsx data\20210109_040000.xlsx data\20210109_080000.xlsx 
data\20210109_110000.xlsx data\20210109_140000.xlsx data\20210109_170000.xlsx data\20210109_210000.xlsx data\20210110_040000.xlsx data\20210110_080000.xlsx data\20210110_110000.xlsx data\20210110_140000.xlsx data\20210110_170000.xlsx data\20210110_210000.xlsx data\20210111_040000.xlsx data\20210111_080000.xlsx data\20210111_110000.xlsx data\20210111_140000.xlsx data\20210111_170000.xlsx data\20210111_210000.xlsx data\20210112_040000.xlsx data\20210112_080000.xlsx data\20210112_110000.xlsx data\20210112_140000.xlsx data\20210112_170000.xlsx data\20210112_210000.xlsx data\20210113_040000.xlsx data\20210113_080000.xlsx data\20210113_110000.xlsx data\20210113_140000.xlsx 400 fichiers chargés en 0:00:46.559468 data\20210113_170000.xlsx data\20210113_210000.xlsx data\20210114_040000.xlsx data\20210114_080000.xlsx data\20210114_110000.xlsx data\20210114_140000.xlsx data\20210114_170000.xlsx data\20210114_210000.xlsx data\20210115_040000.xlsx data\20210115_080000.xlsx data\20210115_110000.xlsx data\20210115_140000.xlsx data\20210115_170000.xlsx data\20210115_210000.xlsx data\20210116_040000.xlsx data\20210116_080000.xlsx data\20210116_110000.xlsx data\20210116_140000.xlsx data\20210116_170000.xlsx data\20210116_210000.xlsx data\20210117_040000.xlsx data\20210117_080000.xlsx data\20210117_110000.xlsx data\20210117_140000.xlsx data\20210117_170000.xlsx data\20210117_210000.xlsx data\20210118_040000.xlsx data\20210118_080000.xlsx data\20210118_110000.xlsx data\20210118_140000.xlsx data\20210118_170000.xlsx data\20210118_210000.xlsx data\20210119_040000.xlsx data\20210119_080000.xlsx data\20210119_110000.xlsx data\20210119_140000.xlsx data\20210119_170000.xlsx data\20210119_210000.xlsx data\20210120_040000.xlsx data\20210120_080000.xlsx data\20210120_110000.xlsx data\20210120_140000.xlsx data\20210120_170000.xlsx data\20210120_210000.xlsx data\20210121_040000.xlsx data\20210121_080000.xlsx data\20210121_110000.xlsx data\20210121_140000.xlsx data\20210121_170000.xlsx 
data\20210121_210000.xlsx 450 fichiers chargés en 0:00:51.990941 data\20210122_040000.xlsx data\20210122_080000.xlsx data\20210122_110000.xlsx data\20210122_140000.xlsx data\20210122_170000.xlsx data\20210122_210000.xlsx data\20210123_040000.xlsx data\20210123_080000.xlsx data\20210123_110000.xlsx data\20210123_140000.xlsx data\20210123_170000.xlsx data\20210123_210000.xlsx data\20210124_040000.xlsx data\20210124_080000.xlsx data\20210124_110000.xlsx data\20210124_140000.xlsx data\20210124_170000.xlsx data\20210124_210000.xlsx data\20210125_040000.xlsx data\20210125_080000.xlsx data\20210125_110000.xlsx data\20210125_140000.xlsx data\20210125_170000.xlsx data\20210125_210000.xlsx data\20210126_040000.xlsx data\20210126_080000.xlsx data\20210126_110000.xlsx data\20210126_140000.xlsx data\20210126_170000.xlsx data\20210126_210000.xlsx data\20210127_040000.xlsx data\20210127_080000.xlsx data\20210127_110000.xlsx data\20210127_140000.xlsx
def correction_NAT(s):
    """Normalise a skipper nationality code, e.g. 'FRFRA' -> 'FRA'.

    Already-correct 3-letter codes ('JPN' -> 'JPN') are returned unchanged;
    NaN inputs (empty Excel cells) are propagated as np.nan.
    """
    # isinstance (not type(s)==float) so numpy float NaN from pandas is caught too
    if isinstance(s, float) and math.isnan(s):
        return np.nan
    # Duplicated codes like 'FRFRA' end with the correct 3-letter code,
    # so keeping the last three characters fixes them.
    return s if len(s) == 3 else s[-3:]
def extract_digits(s):
    """Extract the numeric part of a Vendée Globe race-data cell, e.g.
    '12.5 kts' -> '12.5'.

    Returns the matched substring (still a string); NaN inputs propagate.
    """
    if isinstance(s, float) and math.isnan(s):
        return np.nan
    # Raw string fixes the invalid '\d' escape warning on modern Python.
    # The '.' is deliberately left unescaped: it also matches a digit in
    # values without an explicit decimal point.
    return re.findall(r'([-\d]*.\d*)[ °]', s)[0]
# Ranking columns whose text cells contain a number to pull out with
# extract_digits (headings, speeds, VMG, distances, DTF and DTL)
cols_digits = ['Cap_30m', 'Vitesse_30m', 'VMG_30m', 'Distance_30m', 'Cap_class', 'Vitesse_class'
               , 'VMG_class', 'Distance_class', 'Cap_24h', 'Vitesse_24h', 'VMG_24h', 'Distance_24h', 'DTF', 'DTL']
def extract_voile(s):
    """Extract the sail number (first run of 1-4 digits) from a string.

    NaN inputs (missing cells) propagate as np.nan.
    """
    if isinstance(s, float) and math.isnan(s):
        return np.nan
    # Raw string fixes the invalid '\d' escape warning on modern Python
    return re.findall(r'(\d{1,4})', s)[0]
# Helpers converting satellite DMS (degrees/minutes/seconds) coordinates
# to decimal degrees and back.
def dms2dd(degrees, minutes, seconds, direction):
    """Convert D/M/S plus a compass direction to signed decimal degrees."""
    dd = float(degrees) + float(minutes) / 60 + float(seconds) / 3600
    if direction in ('S', 'W'):  # south and west are negative
        dd *= -1
    return dd

def dd2dms(deg):
    """Convert decimal degrees to a [degrees, minutes, seconds] list."""
    d = int(deg)
    md = abs(deg - d) * 60
    m = int(md)
    sd = (md - m) * 60
    return [d, m, sd]

def parse_dms(dms):
    """Parse a combined 'lat long' DMS string into (lat, long) decimal degrees.

    NaN inputs propagate as (nan, nan).
    """
    if isinstance(dms, float) and math.isnan(dms):
        return np.nan, np.nan
    # Raw string fixes the invalid '\d'/'\w' escape warnings; any run of
    # non-alphanumeric characters (°, ', ", spaces) separates the 8 fields
    parts = re.split(r'[^\d\w]+', dms)
    lat = dms2dd(parts[0], parts[1], parts[2], parts[3])
    lng = dms2dd(parts[4], parts[5], parts[6], parts[7])
    return (lat, lng)

def get_lat(dms):
    """Latitude component of a combined 'lat long' DMS string."""
    return parse_dms(dms)[0]

def get_long(dms):
    """Longitude component of a combined 'lat long' DMS string."""
    return parse_dms(dms)[1]
def preparation_voiliers(df):
    """Return a copy of the boats table with a numeric 'Voile' (sail number)
    column extracted from 'Numéro de voile'."""
    out = df.copy()
    out['Voile'] = out['Numéro de voile'].apply(extract_voile).astype(float)
    # Sail numbers are not harmonised across source files; patch known cases
    fixes = [
        ('Nom', 'LinkedOut', 59.0),
        ('Nom', 'MEDALLIA', 777.0),
        ('Skipper', 'Boris HERRMANN', 10.0),
    ]
    for column, value, sail_number in fixes:
        out.loc[out[column] == value, 'Voile'] = sail_number
    return out
df_voiliers_prep = preparation_voiliers(df_voiliers)
def preparation_vendee(df):
    """Clean and enrich the concatenated ranking DataFrame: parse dates, split
    combined columns, extract numeric values, join the boats table and add
    helper columns used by the visualisations below."""
    df_vendee = df.copy()
    # Drop records that have no skipper (empty Nat/Voile cell)
    df_vendee = df_vendee.dropna(subset=["Nat/Voile"])
    # Parse the YYYYMMDD_HH0000 stamp that came from the file names
    df_vendee.loc[:, 'Date'] = pd.to_datetime(df_vendee.loc[:, 'Date'], format = '%Y%m%d_%H0000')
    # Nat/Voile holds "nationality<newline>sail number": strip newlines, split on space
    df_vendee[['Nat','Voile']] = df_vendee['Nat/Voile'].str.replace('\n', '').str.split(' ',expand=True)
    # Fix typography errors in the nationality codes (e.g. FRFRA -> FRA)
    df_vendee['Nat'] = df_vendee['Nat'].apply(correction_NAT)
    # Split the combined skipper/boat column into two columns
    df_vendee[['Skipper','Bateau']] = df_vendee['Skipper/Bateau'].str.split('\n',expand=True)
    # Pull the numeric part out of every measurement column (values stay strings here)
    for col in cols_digits:
        df_vendee[col] = df_vendee[col].apply(extract_digits)
    # Drop columns made redundant by the splits above (1)
    df_vendee = df_vendee.drop(columns = ['Nat/Voile', 'Heure_Fr','Skipper/Bateau'])
    df_vendee['Voile'] = df_vendee['Voile'].astype(float)
    # Join the boat characteristics on the sail number
    df_vendee = df_vendee.merge(df_voiliers_prep , how = 'left', left_on = 'Voile', right_on = 'Voile', copy = False)
    # Drop merge duplicates (2); keep the glossary spelling of skipper names
    df_vendee = df_vendee.drop(columns = ['Skipper_x', 'Bateau'])
    df_vendee = df_vendee.rename(columns={"Skipper_y": "Skipper"})
    # Decimal-degree coordinates parsed from the combined 'Lat long' DMS text
    df_vendee['Lat_float'] = (df_vendee['Lat'] +' '+ df_vendee['long']).apply(lambda x : get_lat(x))
    df_vendee['long_float'] = (df_vendee['Lat'] +' '+ df_vendee['long']).apply(lambda x : get_long(x))
    # Add the final rank per sail number.
    # NOTE(review): df_final_rank is defined in a cell not visible here --
    # verify it exists before running this function
    df_vendee = df_vendee.merge(df_final_rank, how='left', left_on='Voile', right_on='Voile', copy = False, suffixes = [None, '_final'])
    # DTF without missing values, forward-filled in (Skipper, Date) order, to
    # ease some visualisations.
    # NOTE(review): fillna(method='ffill') is deprecated in recent pandas (.ffill())
    df_vendee['DTF'] = df_vendee['DTF'].astype(float)
    df_vendee["DTF_filled"] = df_vendee.sort_values(by = ['Skipper', 'Date'])['DTF'].fillna(method='ffill')
    # Colour-helper column for the animation: True/False for retired-or-not,
    # plus a dedicated value for the winner so he gets his own colour
    df_vendee['RET'] = df_vendee['Rang'] == 'RET'
    df_vendee.loc[df_vendee['Skipper'] == 'Yannick BESTAVEN', 'RET'] = 'Yannick BESTAVEN'
    return df_vendee
df_vendee_prep = preparation_vendee(df_vendee)
Dans cette partie nous allons nous intéresser au vainqueur de l'édition, Yannick Bestaven et aux trajets suivis par les skippers. Nous allons aussi constater le rythme effréné de l'édition 2020/2021 entre arrivées dans un mouchoir de poche et nombreux abandons.
Commençons par afficher le trajet de Yannick par rapport à quelques uns de ses concurrents
# Afficher le parcours de tous les skippers avec Bestaven en rouge
def is_bestaven(s):
    """Label a skipper for the colour scale: the winner keeps his own name,
    every other skipper is labelled 'Concurrent'."""
    return 'Yannick BESTAVEN' if s == 'Yannick BESTAVEN' else 'Concurrent'
# Plot the route of ten skippers, Bestaven highlighted in its own colour
l_skipper = df_vendee_prep['Skipper'].unique()[10:20]
# .copy() so the new column below does not raise SettingWithCopyWarning
df = df_vendee_prep[df_vendee_prep['Skipper'].isin(l_skipper)].copy()
df["is_Bestaven"] = df['Skipper'].apply(is_bestaven)
df = df.sort_values(by = ['Skipper', 'Date'])
fig = px.line_geo(df,
                  lat="Lat_float", lon = 'long_float',
                  color="is_Bestaven",
                  projection="orthographic")
fig.show()
<ipython-input-239-2653d0d5db90>:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
On peut déjà faire une observation : Yannick Bestaven semble avoir pu anticiper sur son tour de l'antarctique en faisant cap à l'est assez tôt dans la course.
Intéressons-nous maintenant aux leaders de la course et tentons de voir s'ils ont pris une route différente de leurs concurrents. Le chapeau 1 des skippers est composé des 5 skippers arrivés premiers, le chapeau 2 est constitué des autres skippers.
# Afficher le parcours avec le rang final comme couleur
df = df_vendee_prep
df = df.sort_values(by = ['Skipper', 'Date'])
df['Chapeau_final'] = np.digitize(df['Rang_final'].replace('RET', 26).astype(int), [1,5])
fig = px.line_geo(df,
lat="Lat_float", lon = 'long_float',
color="Chapeau_final", # "continent" is one of the columns of gapminder
projection="orthographic")
fig.show()
On remarque ici que tous les skippers ont respecté la Zone d'Exclusion de l'Antarctique (ZEA), qui est une zone de sécurité délimitée autour de l'Antarctique pour éviter aux skippers de rencontrer des icebergs.
Source : Parcours sur Wikipédia https://fr.wikipedia.org/wiki/Vendée_Globe_2020-2021#Parcours
Il semble que tous les skippers aient suivi un chemin similaire (en première approximation), les différences se feront sans doute sur la vitesse de leur embarcation.
# Density of "class" speeds over the whole race: everyone vs two top skippers
figure(figsize=(12, 6), dpi=80)
df_vendee_prep['Vitesse_class'].astype(float).plot.kde(label = 'Tous Skippers')
for skipper_name, plot_label in [('Yannick BESTAVEN', 'Yannick Bestaven'),
                                 ('Charlie DALIN', 'Charlie Dalin')]:
    speeds = df_vendee_prep.loc[df_vendee_prep['Skipper'] == skipper_name, 'Vitesse_class']
    speeds.astype(float).plot.kde(label = plot_label)
plt.xlabel('Vitesse')
plt.legend()
<matplotlib.legend.Legend at 0x1bc6546f5b0>
Le profil de vitesse sur la course de Yannick Bestaven est particulier puisqu'il semble avoir eu 2 régimes différents lors de la course, peut-être est-ce le marqueur de l'utilisation de foils ?
# Remaining distance over time for Bestaven and his two closest rivals
l_skipper = ['Boris HERRMANN', 'Louis BURTON', 'Yannick BESTAVEN']
figure(figsize=(12, 6), dpi=80)
# .copy() so the DTF cast below does not raise SettingWithCopyWarning
df = df_vendee_prep.loc[df_vendee_prep['Skipper'].isin(l_skipper)].copy()
df['DTF'] = df['DTF'].astype(float)
sns.lineplot(data=df, x="Date", y="DTF", hue = 'Skipper')
plt.title("Distance to finish en fonction de la date : une édition très disputée")
<ipython-input-440-283bef57f6d0>:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Text(0.5, 1.0, 'Distance to finish en fonction de la date : une édition très disputée')
Yannick Bestaven semble avoir pris un peu d'avance courant Janvier, entre 13 000 et 5 000 km de l'arrivée mais tout s'est finalement joué au finish. L'édition 2020/2021 a été très disputée avec 8 skippers arrivés le même jour.
# Speed along the course, smoothed with a 20-point rolling mean
l_skipper = ['Boris HERRMANN', 'Louis BURTON', 'Yannick BESTAVEN']
figure(figsize=(12, 6), dpi=80)
# .copy() so the two column rewrites below do not raise SettingWithCopyWarning
df = df_vendee_prep.loc[df_vendee_prep['Skipper'].isin(l_skipper)].copy()
df['DTF'] = df['DTF'].astype(float)
# NOTE(review): the rolling window runs over the concatenated rows of all
# three skippers, so a few points at each skipper boundary mix two skippers
df['Vitesse_class'] = df['Vitesse_class'].rolling(20).mean().astype(float)
sns.lineplot(data=df, x="DTF", y="Vitesse_class", hue = 'Skipper')
<ipython-input-345-fddf01b68df7>:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-345-fddf01b68df7>:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
<AxesSubplot:xlabel='DTF', ylabel='Vitesse_class'>
Les profils de courbe de vitesse des différents skippers sont superposables : globalement, tous les skippers sont allés vite et lentement aux mêmes endroits. C'est un nouveau signe que l'édition a dû être disputée : les bateaux devaient être au coude à coude.
On peut remarquer un déphasage de Yannick Bestaven entre 5 000 et 13 000 km de l'arrivée, concomitant avec son avance sur le peloton (cf. dernière visualisation).
# Routes of three skippers over the 13 000 -> 5 000 km stretch before the finish
l_skipper = ['Boris HERRMANN', 'Louis BURTON', 'Yannick BESTAVEN']
stretch_mask = (
    df_vendee_prep['Skipper'].isin(l_skipper)
    & (df_vendee_prep['DTF'] > 5000)
    & (df_vendee_prep['DTF'] < 13000)
)
df = df_vendee_prep[stretch_mask].sort_values(by = ['Skipper', 'Date'])
fig = px.line_geo(df,
                  lat="Lat_float", lon = 'long_float',
                  color="Skipper",
                  projection="orthographic", hover_name = 'DTF', title = 'Parcours de 3 Skippers de 13 000 km à 5 000 km de l arrivee')
fig.show()
L'édition 2020/2021 a vu 6 skippers abandonner, dont la plupart en peu de temps...
# Number of skippers flagged 'RET' (retired) at each ranking timestamp
df_RET = df_vendee_prep[df_vendee_prep['Rang'] == 'RET']
df_RET = df_RET.groupby('Date').size().to_frame()
# Left-join the per-date counts onto the full list of ranking dates so dates
# without any retired skipper show as 0, then plot the series
pd.Series(df_vendee_prep['Date'].unique()).to_frame(name='Date').merge(df_RET.reset_index(), how='left', left_on = 'Date', right_on = 'Date').fillna(0.0).set_index('Date').plot()
plt.ylabel('Nombre d abadons')
plt.title("Nombre d'abandons au fur et à mesure de la course")
# Arrow pointing at the cluster of retirements near the Cape of Good Hope
plt.annotate("Abandons du Cap de Bonne Espérance",
             xy=("2020-12-07", 3), xycoords='data',
             xytext=("2020-12-10", 1), textcoords='data',
             arrowprops=dict(arrowstyle="->",
                             connectionstyle="arc3"),
             )
Text(2020-12-10, 1, 'Abandons du Cap de Bonne Espérance')
# Routes of the skippers who retired, one colour per skipper
df = df_vendee_prep[df_vendee_prep['Rang_final'] == 'RET'].sort_values(by = ['Skipper', 'Date'])
fig = px.line_geo(df,
                  lat="Lat_float", lon = 'long_float',
                  color="Skipper",
                  projection="orthographic")
fig.show()
La vague d'abandons du Cap de Bonne-Espérance (Afrique du Sud) est due à des problèmes divers, apparemment indépendants.
import matplotlib.ticker as ticker
import matplotlib.animation as animation
from IPython.display import HTML
# Map each RET flag value (False / True / 'Yannick BESTAVEN') to a bar colour.
# NOTE(review): which value gets which colour depends on the order returned by
# unique() -- verify the mapping matches the intended colours
colors = dict(zip(
    df_vendee_prep['RET'].unique(),
    ['#6699d8', '#FF3333', '#d3dbdc']
))
def draw_barchart(frame):
    """Draw one animation frame: a horizontal bar per skipper showing the
    remaining distance (DTF_filled) at ranking date *frame*."""
    ax.clear()
    frame_df = df_vendee_prep[df_vendee_prep['Date'] == frame]
    bar_colors = [colors[flag] for flag in frame_df['RET']]
    ax.barh(frame_df['Skipper'], frame_df['DTF_filled'], color = bar_colors, label = bar_colors)
    # Annotate each bar with its distance rounded to the nearest 10 km
    for i, (value, name) in enumerate(zip(frame_df['DTF_filled'], frame_df['Skipper'])):
        ax.text(value, i - 0.3, round(round(value, -1)), ha='left')
    plt.legend(frame_df['Skipper'])
    plt.title("Résumé de la course de Yannick Bestaven")
# Animation frames: keep one ranking date out of every d
range_date = iter(df_vendee_prep['Date'].unique())
d = 5
# Equivalent to the original counter loop (the counter resets to 0 and is then
# incremented on a hit, so the first kept date is index d and the period is d)
range_date_restricted = iter(df_vendee_prep['Date'].unique()[d::d])
En abscisse, le nombre de km restants pour chaque skipper, indiqué en ordonnées
# Run the bar-chart race over the restricted set of dates and render it as
# interactive JS/HTML inline in the notebook
fig, ax = plt.subplots(figsize=(15, 8))
animator = animation.FuncAnimation(fig, draw_barchart, frames=range_date_restricted)
HTML(animator.to_jshtml())
Yannick Bestaven a mené la course un moment pour finalement se faire doubler au finish ! Il finit néanmoins premier grâce à une compensation en temps pour avoir secouru un autre skipper qui avait fait naufrage. Bravo à lui et bon vent pour la transat Jacques Vabre qui commence aujourd'hui !